After learning so many supervised learning methods, this post pulls them together into one summary, which makes it easier to implement results end to end.
Reading CSV files
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
data_path = '/Users/maylin/Documents/Data/Midterm/'
df_train = pd.read_csv(data_path + 'train_data.csv')
df_test = pd.read_csv(data_path + 'test_data.csv')
For other ways of reading files, see the Day005 Sample Code.
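A minimal sketch of a few other common read patterns (the file names below are hypothetical, just to illustrate the calls):
import pickle
with open(data_path + 'example.txt', 'r') as f:  # plain text file, read line by line
    lines = f.readlines()
arr = np.load(data_path + 'example.npy')  # NumPy binary array
with open(data_path + 'example.pkl', 'rb') as f:  # pickled Python object
    obj = pickle.load(f)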
data_src['人口'].max()  # maximum of a single column
app_train.loc[0:10000]  # select rows with index labels 0 through 10000
df.dtypes  # dtype of every column
df_train.shape  # (number of rows, number of columns)
app_train.dtypes.value_counts()  # how many columns there are of each dtype
app_train.select_dtypes(include=["object"]).apply(pd.Series.nunique, axis = 0)  # number of distinct values in each object column
For Label encoding and One Hot encoding, see the Day006 Sample Code.
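A minimal sketch of both encodings, assuming app_train is already loaded and following the common rule of label-encoding only the object columns with two or fewer distinct values (and no missing values) while one-hot encoding the rest:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
for col in app_train.select_dtypes(include=['object']).columns:
    if app_train[col].nunique() <= 2:
        app_train[col] = le.fit_transform(app_train[col])  # Label encoding: map the two categories to 0/1
app_train = pd.get_dummies(app_train)  # One Hot encoding for the remaining object columns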
The following is based on the Day013 Sample Code.
result = pd.concat([df1, df2, df3])  # stack vertically (append rows)
result = pd.concat([df1, df4], axis = 1)  # stack horizontally (append columns), keeping all indexes
result = pd.concat([df1, df4], axis = 1, join = 'inner')  # horizontal concat, keeping only the shared indexes
result = pd.merge(df1, df4, how='inner')  # SQL-style inner join on the common columns
print(df1)
df1.melt()  # unpivot from wide format to long format
sub_df = app_train[app_train['TARGET'] == 1]
sub_df.head()
sub_df = app_train.loc[app_train['AMT_INCOME_TOTAL'] > app_train['AMT_INCOME_TOTAL'].mean(), ['SK_ID_CURR', 'TARGET']]
sub_df.head()
app_train.groupby(['NAME_CONTRACT_TYPE']).size()
app_train.groupby(['NAME_CONTRACT_TYPE'])['AMT_INCOME_TOTAL'].describe()
cut_rule = [-np.inf, 0, 2, 5, np.inf]
app_train['CNT_CHILDREN_GROUP'] = pd.cut(app_train['CNT_CHILDREN'].values, cut_rule, include_lowest=True)
app_train['CNT_CHILDREN_GROUP'].value_counts()
grp = ['CNT_CHILDREN_GROUP', 'TARGET']
grouped_df = app_train.groupby(grp)['AMT_INCOME_TOTAL']
grouped_df.mean()
Based on the Day007 Sample Code.
# Training requires train_X, train_Y; the prediction output requires ids (to identify each prediction) and test_X
# Pull train_Y and ids out first, then concatenate what will become train_X / test_X into df so feature engineering can be applied to both at once
train_Y = np.log1p(df_train['SalePrice'])
ids = df_test['Id']
df_train = df_train.drop(['Id', 'SalePrice'] , axis=1)
df_test = df_test.drop(['Id'] , axis=1)
df = pd.concat([df_train,df_test])
df.head()
# Show the column dtypes and how many columns there are of each
dtype_df = df.dtypes.reset_index()
dtype_df.columns = ["Count", "Column Type"]
dtype_df = dtype_df.groupby("Column Type").aggregate('count').reset_index()
dtype_df
# Having confirmed there are only three dtypes (int64, float64, object), loop over the column names and collect them into three lists
int_features = []
float_features = []
object_features = []
# .dtypes (column dtypes) and .columns (column names) are two attributes a DataFrame provides; this also shows how to pair for with zip
for dtype, feature in zip(df.dtypes, df.columns):
    if dtype == 'float64':
        float_features.append(feature)
    elif dtype == 'int64':
        int_features.append(feature)
    else:
        object_features.append(feature)
# The syntax used here is an f-string, which is only available from Python 3.6 onwards
# If it fails to run, either upgrade Python or rewrite the statements in str.format form
# For how to rewrite them, see https://blog.louie.lu/2017/08/08/outdate-python-string-format-and-fstring/
print(f'{len(int_features)} Integer Features : {int_features}\n')
print(f'{len(float_features)} Float Features : {float_features}\n')
print(f'{len(object_features)} Object Features : {object_features}')
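For reference, the first of these prints can be rewritten in str.format style like this:
print('{} Integer Features : {}\n'.format(len(int_features), int_features))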
Based on the Day007 answer code.
Mean of the integer (int) features
df[int_features].mean()
Maximum of the integer (int) features
df[int_features].max()
Number of distinct values (nunique) in the integer (int) features
df[int_features].nunique()
Number of distinct values (nunique) in the categorical (object) features
df[object_features].nunique()
Based on the Day008 answer code.
Look at the column's count, mean, std, min, 25%, 50%, 75%, and max
app_train['AMT_INCOME_TOTAL'].describe()
Plot a histogram of this column
app_train['AMT_INCOME_TOTAL'].hist()
plt.xlabel('AMT_INCOME_TOTAL')
Notice that there is an unusually large gap between this column's maximum and its 75th percentile, so plotting the histogram directly is not very informative; filter out the extreme tail first and then plot again.
app_train.loc[app_train['AMT_INCOME_TOTAL']<app_train['AMT_INCOME_TOTAL'].quantile(0.99)]['AMT_INCOME_TOTAL'].hist()
plt.xlabel('AMT_INCOME_TOTAL')
Each column needs to be inspected one by one.
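A minimal sketch of looping over every numeric column and drawing its histogram (assuming app_train and matplotlib are already loaded):
for col in app_train.select_dtypes(include=['int64', 'float64']).columns:
    app_train[col].hist(bins=50)
    plt.xlabel(col)
    plt.show()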
Based on the Day011 sample answer code.
# 2.1 Fill NAs with the median (q50)
print("Before replacing NAs, number of rows where AMT_ANNUITY is NA: %i" % sum(app_train['AMT_ANNUITY'].isnull()))
q_50 = np.percentile(app_train[~app_train['AMT_ANNUITY'].isnull()]['AMT_ANNUITY'], 50)
app_train.loc[app_train['AMT_ANNUITY'].isnull(),'AMT_ANNUITY'] = q_50
print("After replace NAs, numbers of row that AMT_ANNUITY is NAs: %i" % sum(app_train['AMT_ANNUITY'].isnull()))
# 3
print("Before replace NAs, numbers of row that AMT_GOODS_PRICE is NAs: %i" % sum(app_train['AMT_GOODS_PRICE'].isnull()))
# 列出重複最多的數值
print(app_train['AMT_GOODS_PRICE'].value_counts().head())
mode_goods_price = list(app_train['AMT_GOODS_PRICE'].value_counts().index)
app_train.loc[app_train['AMT_GOODS_PRICE'].isnull(), 'AMT_GOODS_PRICE'] = mode_goods_price[0]
print("After replace NAs, numbers of row that AMT_GOODS_PRICE is NAs: %i" % sum(app_train['AMT_GOODS_PRICE'].isnull()))
Based on the Day011 Sample Code.
# Using AMT_CREDIT as an example
app_train['AMT_CREDIT'].hist(bins = 50)
plt.title("Original")
plt.show()
value = app_train['AMT_CREDIT'].values
app_train['AMT_CREDIT_Norm1'] = ( value - np.mean(value) ) / ( np.std(value) )
app_train['AMT_CREDIT_Norm1'].hist(bins = 50)
plt.title("Normalized with Z-transform")
plt.show()
app_train['AMT_CREDIT_Norm2'] = ( value - min(value) ) / ( max(value) - min(value) )
app_train['AMT_CREDIT_Norm2'].hist(bins = 50)
plt.title("Normalized to 0 ~ 1")
plt.show()
Based on the Day011 sample answer code.
# 2.2 Normalize values to -1 to 1
print("== Original data range ==")
print(app_train['AMT_ANNUITY'].describe())
def normalize_value(x):
    x = (( (x - min(x)) / ( max(x) - min(x) ) ) - 0.5) * 2
    return x
app_train['AMT_ANNUITY_NORMALIZED'] = normalize_value(app_train['AMT_ANNUITY'])
print("== Normalized data range ==")
app_train['AMT_ANNUITY_NORMALIZED'].describe()
Based on the Day012 Sample Code.
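The snippets below assume the following setup; in particular, train_num (the number of training rows at the top of the concatenated df) is an assumption based on how train_Y was built earlier:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import MinMaxScaler, StandardScaler
train_num = train_Y.shape[0]  # assumed: rows 0..train_num-1 of df are the training portion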
# Fill missing values with -1, then run a linear regression
df_m1 = df.fillna(-1)
train_X = df_m1[:train_num]
estimator = LinearRegression()
cross_val_score(estimator, train_X, train_Y, cv=5).mean()
# Fill missing values with 0, then run a linear regression
df_0 = df.fillna(0)
train_X = df_0[:train_num]
estimator = LinearRegression()
cross_val_score(estimator, train_X, train_Y, cv=5).mean()
# Fill missing values with the column means, then run a linear regression
df_mn = df.fillna(df.mean())
train_X = df_mn[:train_num]
estimator = LinearRegression()
cross_val_score(estimator, train_X, train_Y, cv=5).mean()
# Fill missing values with -1, combined with min-max scaling
df = df.fillna(-1)
df_temp = MinMaxScaler().fit_transform(df)
train_X = df_temp[:train_num]
estimator = LinearRegression()
cross_val_score(estimator, train_X, train_Y, cv=5).mean()
# Fill missing values with -1, combined with standardization
df = df.fillna(-1)
df_temp = StandardScaler().fit_transform(df)
train_X = df_temp[:train_num]
estimator = LinearRegression()
cross_val_score(estimator, train_X, train_Y, cv=5).mean()
An outlier is a single value in a feature that lies far away from the rest and distorts the distribution of the whole column.
See the Day009 Sample Code: inspect the values of each feature to spot such anomalies (outliers).
# DAYS_EMPLOYED: how long the applicant had been in their current job before applying for the loan
(app_train['DAYS_EMPLOYED'] / 365).describe()
plt.hist(app_train['DAYS_EMPLOYED'])
plt.show()
app_train['DAYS_EMPLOYED'].value_counts()
From this histogram we can see an outlier at the far right.
# Here we replace the anomalous value with NaN
app_train['DAYS_EMPLOYED'].replace({365243: np.nan}, inplace = True)
# Use a new column to flag values of 50 or more (outliers)
app_train['OWN_CAR_AGE_ANOM'] = app_train['OWN_CAR_AGE'] >= 50
Based on the Day010 Sample Code.
# Keep only the two numeric dtypes, int64 and float64, and store those column names in num_features
num_features = []
for dtype, feature in zip(df.dtypes, df.columns):
    if dtype == 'float64' or dtype == 'int64':
        num_features.append(feature)
print(f'{len(num_features)} Numeric Features : {num_features}\n')
# Drop the text columns so that only numeric columns remain
df = df[num_features]
df = df.fillna(-1) # replace NaN values with -1
MMEncoder = MinMaxScaler()
# sklearn's MinMaxScaler rescales each feature to a given range (usually 0 to 1); this stabilizes features with very small ranges and preserves the zero entries of sparse data
df.head()
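MMEncoder is defined above but not applied in this excerpt; a minimal usage sketch (note that fit_transform returns a NumPy array, not a DataFrame):
df_scaled = MMEncoder.fit_transform(df)  # rescale every numeric column to the 0~1 range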
Below, three scenarios compare ways of handling outliers; a linear-regression scoring sketch follows the three scenarios so their scores can be compared.
# Scenario 1: values left as they are
# Show the scatter plot of the GrLivArea column against the target SalePrice
import seaborn as sns
import matplotlib.pyplot as plt
sns.regplot(x = df['GrLivArea'], y=train_Y) # scatter plot with a fitted regression line
plt.show()
# Scenario 2: clip GrLivArea to the range 800 to 2500 (adjust the outliers); SalePrice is left unchanged
df['GrLivArea'] = df['GrLivArea'].clip(800, 2500)
sns.regplot(x = df['GrLivArea'], y=train_Y)
plt.show()
# Scenario 3: keep only rows where GrLivArea lies between 800 and 2500 (discard the outliers); the corresponding SalePrice rows are dropped as well
keep_indexs = (df['GrLivArea']> 800) & (df['GrLivArea']< 2500)
df = df[keep_indexs]
train_Y = train_Y[keep_indexs]
sns.regplot(x = df['GrLivArea'], y=train_Y)
plt.show()
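To actually compare the scenarios by score, a sketch that reuses the Day012 cross-validation pattern; it assumes df at this point holds only the numeric, NaN-filled training rows, so that it stays row-aligned with train_Y:
train_X = MinMaxScaler().fit_transform(df)
estimator = LinearRegression()
print(cross_val_score(estimator, train_X, train_Y, cv=5).mean())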
That's all for this post. Wrapping up here.